From 9c53f1d66a3ff0bf2cb4bdf2166b83c78c7f276a Mon Sep 17 00:00:00 2001 From: "iap10@labyrinth.cl.cam.ac.uk" Date: Fri, 19 Mar 2004 23:43:48 +0000 Subject: [PATCH] bitkeeper revision 1.794.1.1 (405b85b44Vh_3MMuChrmhJ9H5nxbyw) basic shadow support --- .rootkeys | 2 + BitKeeper/etc/ignore | 10 + xen/arch/i386/process.c | 20 +- xen/arch/i386/traps.c | 18 +- xen/common/debug.c | 8 +- xen/common/domain.c | 14 + xen/common/domain_page.c | 2 + xen/common/kernel.c | 29 +- xen/common/memory.c | 98 ++++- xen/common/shadow.c | 618 +++++++++++++++++++++++++++++++ xen/include/asm-i386/config.h | 10 +- xen/include/asm-i386/page.h | 1 + xen/include/asm-i386/processor.h | 6 + xen/include/xeno/mm.h | 5 + xen/include/xeno/perfc_defn.h | 11 +- xen/include/xeno/shadow.h | 48 +++ xen/net/dev.c | 49 ++- 17 files changed, 929 insertions(+), 20 deletions(-) create mode 100644 xen/common/shadow.c create mode 100644 xen/include/xeno/shadow.h diff --git a/.rootkeys b/.rootkeys index e078d7cc52..d28d693e5d 100644 --- a/.rootkeys +++ b/.rootkeys @@ -158,6 +158,7 @@ 4006e659i9j-doVxY7DKOGU4XVin1Q xen/common/rbtree.c 3ddb79bdHqdQpATqC0rmUZNbsb6L6A xen/common/resource.c 3e397e6619PgAfBbw2XFbXkewvUWgw xen/common/schedule.c +405b8599xI_PoEr3zZoJ2on-jdn7iw xen/common/shadow.c 3ddb79bdB9RNMnkQnUyZ5C9hhMSQQw xen/common/slab.c 3ddb79bd0gVQYmL2zvuJnldvD0AGxQ xen/common/softirq.c 3e7f358awXBC3Vw-wFRwPw18qL1khg xen/common/string.c @@ -552,6 +553,7 @@ 3e4540ccU1sgCx8seIMGlahmMfv7yQ xen/include/xeno/reboot.h 3ddb79c0LzqqS0LhAQ50ekgj4oGl7Q xen/include/xeno/sched.h 403a06a7H0hpHcKpAiDe5BPnaXWTlA xen/include/xeno/serial.h +405b8599BsDsDwKEJLS0XipaiQW3TA xen/include/xeno/shadow.h 3ddb79c0VDeD-Oft5eNfMneTU3D1dQ xen/include/xeno/skbuff.h 3ddb79c14dXIhP7C2ahnoD08K90G_w xen/include/xeno/slab.h 3ddb79c09xbS-xxfKxuV3JETIhBzmg xen/include/xeno/smp.h diff --git a/BitKeeper/etc/ignore b/BitKeeper/etc/ignore index 2a2f79ebac..7a0065247b 100644 --- a/BitKeeper/etc/ignore +++ b/BitKeeper/etc/ignore @@ -548,3 +548,13 @@ tools/xentrace/xentrace tools/xc/lib/xc_evtchn.o tools/xc/py/XenoUtil.pyc tools/xend/xend +tools/xc/lib/libxc.so.1.3 +tools/xc/lib/libxc.so.1.3.0 +tools/xc/lib/xc_physdev.o +tools/xend/xend_utils.o +xen/common/physdev.o +xen/common/shadow.o +xen/common/trace.o +xen/drivers/char/console.o +xen/drivers/char/keyboard.o +xen/include/hypervisor-ifs/arch diff --git a/xen/arch/i386/process.c b/xen/arch/i386/process.c index 09170307a7..8ed1cf2dc2 100644 --- a/xen/arch/i386/process.c +++ b/xen/arch/i386/process.c @@ -32,6 +32,7 @@ #include #include +#include int hlt_counter; @@ -281,7 +282,24 @@ void switch_to(struct task_struct *prev_p, struct task_struct *next_p) } /* Switch page tables. */ - write_cr3_counted(pagetable_val(next_p->mm.pagetable)); +#ifdef CONFIG_SHADOW + + /* printk("switch_to %08lx, %08lx\n", next_p->mm.pagetable, + next_p->mm.shadowtable);*/ + + + if( next_p->mm.shadowmode ) + { + write_cr3_counted(pagetable_val(next_p->mm.shadowtable)); + check_pagetable( next_p->mm.pagetable, "switch" ); + } + else +#endif + write_cr3_counted(pagetable_val(next_p->mm.pagetable)); + + + + set_current(next_p); diff --git a/xen/arch/i386/traps.c b/xen/arch/i386/traps.c index f71ce60d57..717ca6d2cb 100644 --- a/xen/arch/i386/traps.c +++ b/xen/arch/i386/traps.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -323,6 +324,8 @@ asmlinkage void do_page_fault(struct pt_regs *regs, long error_code) __asm__ __volatile__ ("movl %%cr2,%0" : "=r" (addr) : ); + perfc_incrc(page_faults); + if ( unlikely(addr >= LDT_VIRT_START) && (addr < (LDT_VIRT_START + (p->mm.ldt_ents*LDT_ENTRY_SIZE))) ) { @@ -336,6 +339,18 @@ asmlinkage void do_page_fault(struct pt_regs *regs, long error_code) return; /* successfully copied the mapping */ } +#ifdef CONFIG_SHADOW +//printk("1"); +check_pagetable( current->mm.pagetable, "pre-sf" ); + if ( p->mm.shadowmode && addr < PAGE_OFFSET && + shadow_fault( addr, error_code ) ) + { + check_pagetable( current->mm.pagetable, "post-sfa" ); + return; // return true if fault was handled + } + check_pagetable( current->mm.pagetable, "post-sfb" ); +#endif + if ( unlikely(!(regs->xcs & 3)) ) goto fault_in_hypervisor; @@ -353,7 +368,8 @@ asmlinkage void do_page_fault(struct pt_regs *regs, long error_code) if ( likely((fixup = search_exception_table(regs->eip)) != 0) ) { - DPRINTK("Page fault: %08lx -> %08lx\n", regs->eip, fixup); + perfc_incrc(copy_user_faults); + //DPRINTK("copy_user fault: %08lx -> %08lx\n", regs->eip, fixup); regs->eip = fixup; regs->xds = regs->xes = regs->xfs = regs->xgs = __HYPERVISOR_DS; return; diff --git a/xen/common/debug.c b/xen/common/debug.c index dff739d99a..4e298bbfb5 100644 --- a/xen/common/debug.c +++ b/xen/common/debug.c @@ -91,7 +91,13 @@ int pdb_change_values(domid_t domain, u_char *buffer, unsigned long addr, if ((addr >> PAGE_SHIFT) == ((addr + length - 1) >> PAGE_SHIFT)) { - l2_table = map_domain_mem(pagetable_val(p->mm.pagetable)); +#ifdef CONFIG_SHADOW + if (p->mm.shadowmode ) + l2_table = map_domain_mem(pagetable_val(p->mm.shadowtable)); + else +#endif + l2_table = map_domain_mem(pagetable_val(p->mm.pagetable)); + l2_table += l2_table_offset(addr); if (!(l2_pgentry_val(*l2_table) & _PAGE_PRESENT)) { diff --git a/xen/common/domain.c b/xen/common/domain.c index 53cea06285..c63c9164e3 100644 --- a/xen/common/domain.c +++ b/xen/common/domain.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -546,6 +547,10 @@ int final_setup_guestos(struct task_struct *p, dom0_builddomain_t *builddomain) get_page_and_type(&frame_table[phys_l2tab>>PAGE_SHIFT], p, PGT_l2_page_table); +#ifdef CONFIG_SHADOW + p->mm.shadowtable = shadow_mk_pagetable(phys_l2tab, p->mm.shadowmode); +#endif + /* Set up the shared info structure. */ update_dom_time(p->shared_info); @@ -847,6 +852,15 @@ int setup_guestos(struct task_struct *p, dom0_createdomain_t *params, set_bit(PF_CONSTRUCTED, &p->flags); +#ifdef CONFIG_SHADOW + +printk("Engage shadow mode for dom 0\n"); + p->mm.shadowmode = SHM_test; // XXXXX IAP + p->mm.shadowtable = shadow_mk_pagetable(phys_l2tab, p->mm.shadowmode ); +#endif + + + new_thread(p, (unsigned long)virt_load_address, (unsigned long)virt_stack_address, diff --git a/xen/common/domain_page.c b/xen/common/domain_page.c index 5e5974562a..723d7e33e3 100644 --- a/xen/common/domain_page.c +++ b/xen/common/domain_page.c @@ -45,6 +45,8 @@ void *map_domain_mem(unsigned long pa) unsigned long *cache = mapcache; unsigned long flags; + perfc_incrc(map_domain_mem_count); + spin_lock_irqsave(&map_lock, flags); /* Has some other CPU caused a wrap? We must flush if so. */ diff --git a/xen/common/kernel.c b/xen/common/kernel.c index b963c6f5e9..1737c72204 100644 --- a/xen/common/kernel.c +++ b/xen/common/kernel.c @@ -104,6 +104,7 @@ void cmain(unsigned long magic, multiboot_info_t *mbi) module_t *mod; void *heap_start; int i; + unsigned long frametable_pages, max_mem; /* Parse the command-line options. */ cmdline = (unsigned char *)(mbi->cmdline ? __va(mbi->cmdline) : NULL); @@ -190,22 +191,36 @@ void cmain(unsigned long magic, multiboot_info_t *mbi) for ( ; ; ) ; } - /* The array of pfn_info structures must fit into the reserved area. */ - if ( sizeof(struct pfn_info) > 24 ) + frametable_pages = ((FRAMETABLE_VIRT_END - RDWR_MPT_VIRT_START)/sizeof(struct pfn_info)); + + if ( frametable_pages < (1<<(32-PAGE_SHIFT)) ) { - printk("'struct pfn_info' too large to fit in Xen address space!\n"); - for ( ; ; ) ; + printk("Not enough space to initialise frame table for a 4GB machine (%luMB only)\n", frametable_pages >> (20-PAGE_SHIFT)); } set_current(&idle0_task); - max_page = (mbi->mem_upper+1024) >> (PAGE_SHIFT - 10); + max_mem = max_page = (mbi->mem_upper+1024) >> (PAGE_SHIFT - 10); + + if ( max_page > frametable_pages ) + max_page = frametable_pages; + init_frametable(max_page); - printk("Initialised all memory on a %luMB machine\n", - max_page >> (20-PAGE_SHIFT)); + printk("Initialised %luMB memory on a %luMB machine\n", + max_page >> (20-PAGE_SHIFT), + max_mem >> (20-PAGE_SHIFT) ); heap_start = memguard_init(&_end); + printk("Xen heap size is %luKB\n", + (MAX_MONITOR_ADDRESS-__pa(heap_start))/1024 ); + + if ( ((MAX_MONITOR_ADDRESS-__pa(heap_start))/1024) <= 4096 ) + { + printk("Xen heap size is too small to safely continue!\n"); + for ( ; ; ) ; + } + init_page_allocator(__pa(heap_start), MAX_MONITOR_ADDRESS); /* Initialise the slab allocator. */ diff --git a/xen/common/memory.c b/xen/common/memory.c index 32acc0ac11..e672f78fe0 100644 --- a/xen/common/memory.c +++ b/xen/common/memory.c @@ -133,6 +133,7 @@ #include #include #include +#include #include #include #include @@ -182,6 +183,7 @@ static struct { struct task_struct *subject_p; } percpu_info[NR_CPUS] __cacheline_aligned; + /* * init_frametable: * Initialise per-frame memory information. This goes directly after @@ -768,6 +770,13 @@ void free_page_type(struct pfn_info *page, unsigned int type) default: BUG(); } + +#ifdef CONFIG_SHADOW + // assume we're in shadow mode if PSH_shadowed set + if ( page->shadow_and_flags & PSH_shadowed ) + unshadow_table( page-frame_table ); +#endif + } @@ -832,6 +841,10 @@ static int do_extended_command(unsigned long ptr, unsigned long val) put_page_and_type(&frame_table[pagetable_val(current->mm.pagetable) >> PAGE_SHIFT]); current->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT); +#ifdef CONFIG_SHADOW + current->mm.shadowtable = + shadow_mk_pagetable(pfn << PAGE_SHIFT, current->mm.shadowmode); +#endif invalidate_shadow_ldt(); percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB; } @@ -917,6 +930,10 @@ int do_mmu_update(mmu_update_t *ureqs, int count) struct pfn_info *page; int rc = 0, okay = 1, i, cpu = smp_processor_id(); unsigned int cmd; +#ifdef CONFIG_SHADOW + unsigned long prev_spfn = 0; + l1_pgentry_t *prev_spl1e = 0; +#endif perfc_incrc(calls_to_mmu_update); perfc_addc(num_page_updates, count); @@ -967,6 +984,13 @@ int do_mmu_update(mmu_update_t *ureqs, int count) { okay = mod_l1_entry((l1_pgentry_t *)va, mk_l1_pgentry(req.val)); + +#ifdef CONFIG_SHADOW + if ( okay && page->shadow_and_flags & PSH_shadowed ) + shadow_l1_normal_pt_update( req.ptr, req.val, + &prev_spfn, &prev_spl1e ); +#endif + put_page_type(page); } break; @@ -976,6 +1000,11 @@ int do_mmu_update(mmu_update_t *ureqs, int count) okay = mod_l2_entry((l2_pgentry_t *)va, mk_l2_pgentry(req.val), pfn); +#ifdef CONFIG_SHADOW + if ( okay && page->shadow_and_flags & PSH_shadowed ) + shadow_l2_normal_pt_update( req.ptr, req.val ); +#endif + put_page_type(page); } break; @@ -985,9 +1014,19 @@ int do_mmu_update(mmu_update_t *ureqs, int count) *(unsigned long *)va = req.val; okay = 1; put_page_type(page); + +#ifdef CONFIG_SHADOW + if ( page->shadow_and_flags & PSH_shadowed ) + BUG(); + // at present, we shouldn't be shadowing such pages +#endif + + } break; } + +check_pagetable( current->mm.pagetable, "mmu" ); // XXX XXX XXX XXX XXX put_page(page); @@ -1031,11 +1070,23 @@ int do_mmu_update(mmu_update_t *ureqs, int count) if ( prev_pfn != 0 ) unmap_domain_mem((void *)va); +#ifdef CONFIG_SHADOW + if( prev_spl1e != 0 ) + unmap_domain_mem((void *)prev_spl1e); +#endif + deferred_ops = percpu_info[cpu].deferred_ops; percpu_info[cpu].deferred_ops = 0; if ( deferred_ops & DOP_FLUSH_TLB ) - write_cr3_counted(pagetable_val(current->mm.pagetable)); + { +#ifdef CONFIG_SHADOW + if ( unlikely(current->mm.shadowmode) ) + write_cr3_counted(pagetable_val(current->mm.shadowtable)); + else +#endif + write_cr3_counted(pagetable_val(current->mm.pagetable)); + } if ( deferred_ops & DOP_RELOAD_LDT ) (void)map_ldt_shadow_page(0); @@ -1059,19 +1110,62 @@ int do_update_va_mapping(unsigned long page_nr, unsigned int cpu = p->processor; unsigned long deferred_ops; + perfc_incrc(calls_to_update_va); + if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) ) return -EINVAL; + // XXX when we make this support 4MB pages we should also + // deal with the case of updating L2s + if ( unlikely(!mod_l1_entry(&linear_pg_table[page_nr], mk_l1_pgentry(val))) ) err = -EINVAL; +#ifdef CONFIG_SHADOW + + if ( unlikely(p->mm.shadowmode) ) + { + unsigned long sval = 0; + + // XXX this only works for l1 entries, with no translation + + if ( (val & _PAGE_PRESENT) && (val & _PAGE_ACCESSED) ) + { + sval = val; + if ( !(val & _PAGE_DIRTY) ) + sval &= ~_PAGE_RW; + } + + /* printk("update_va_map: page_nr=%08lx val =%08lx sval =%08lx\n", + page_nr, val, sval);*/ + + if ( __put_user( sval, ((unsigned long *) (&shadow_linear_pg_table[page_nr])) ) ) + { + // Since L2's are guranteed RW, failure indicates the page + // was not shadowed, so ignore. + + //MEM_LOG("update_va_map: couldn't write update\n"); + } + } + +check_pagetable( p->mm.pagetable, "va" ); + +#endif + deferred_ops = percpu_info[cpu].deferred_ops; percpu_info[cpu].deferred_ops = 0; if ( unlikely(deferred_ops & DOP_FLUSH_TLB) || unlikely(flags & UVMF_FLUSH_TLB) ) - write_cr3_counted(pagetable_val(p->mm.pagetable)); + { +#ifdef CONFIG_SHADOW + if ( unlikely(p->mm.shadowmode) ) + write_cr3_counted(pagetable_val(p->mm.shadowtable)); + else +#endif + write_cr3_counted(pagetable_val(p->mm.pagetable)); + } else if ( unlikely(flags & UVMF_INVLPG) ) __flush_tlb_one(page_nr << PAGE_SHIFT); diff --git a/xen/common/shadow.c b/xen/common/shadow.c new file mode 100644 index 0000000000..7756b7dada --- /dev/null +++ b/xen/common/shadow.c @@ -0,0 +1,618 @@ +/* -*- Mode:C; c-basic-offset:4; tab-width:4 -*- */ + +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_SHADOW + + +#if 1 +#define MEM_VLOG(_f, _a...) \ + printk("DOM%llu: (file=shadow.c, line=%d) " _f "\n", \ + current->domain , __LINE__ , ## _a ) +#else +#define MEM_VLOG(_f, _a...) +#endif + +#if 0 +#define MEM_VVLOG(_f, _a...) \ + printk("DOM%llu: (file=shadow.c, line=%d) " _f "\n", \ + current->domain , __LINE__ , ## _a ) +#else +#define MEM_VVLOG(_f, _a...) +#endif + + +/******** + +To use these shadow page tables, guests must not rely on the ACCESSED +and DIRTY bits on L2 pte's being accurate -- they will typically all be set. + +I doubt this will break anything. (If guests want to use the va_update +mechanism they've signed up for this anyhow...) + +********/ + + +pagetable_t shadow_mk_pagetable( unsigned long gptbase, + unsigned int shadowmode ) +{ + unsigned long gpfn, spfn=0; + + MEM_VVLOG("shadow_mk_pagetable( gptbase=%08lx, mode=%d )", + gptbase, shadowmode ); + + if ( unlikely(shadowmode) ) + { + gpfn = gptbase >> PAGE_SHIFT; + + if ( likely(frame_table[gpfn].shadow_and_flags & PSH_shadowed) ) + { + spfn = frame_table[gpfn].shadow_and_flags & PSH_pfn_mask; + } + else + { + spfn = shadow_l2_table( gpfn ); + } + } + + return mk_pagetable(spfn << PAGE_SHIFT); +} + +void unshadow_table( unsigned long gpfn ) +{ + unsigned long spfn; + +MEM_VLOG("unshadow_table %08lx\n", gpfn ); + + perfc_incrc(unshadow_table_count); + + // this function is the same for both l1 and l2 tables + + // even in the SMP guest case, there won't be a race here as + // this CPU was the one that cmpxchg'ed the page to invalid + + spfn = frame_table[gpfn].shadow_and_flags & PSH_pfn_mask; + frame_table[gpfn].shadow_and_flags=0; + frame_table[spfn].shadow_and_flags=0; + +#ifdef DEBUG + { // XXX delete me! + int i; + unsigned long * spl1e = map_domain_mem( spfn<> L2_PAGETABLE_SHIFT] = + mk_l2_pgentry((gpfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); + spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] = + mk_l2_pgentry((spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); + spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] = + mk_l2_pgentry(__pa(frame_table[gpfn].u.domain->mm.perdomain_pt) | + __PAGE_HYPERVISOR); + + // can't use the linear map as we may not be in the right PT + gpl2e = (l2_pgentry_t *) map_domain_mem(gpfn << PAGE_SHIFT); + + // proactively create entries for pages that are already shadowed + for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) + { + unsigned long spte = 0; + +#if 0 // Turns out this doesn't really help + unsigned long gpte; + + gpte = l2_pgentry_val(gpl2e[i]); + + if (gpte & _PAGE_PRESENT) + { + unsigned long s_sh = + frame_table[ gpte>>PAGE_SHIFT ].shadow_and_flags; + + if( s_sh & PSH_shadowed ) // PSH_shadowed + { + if ( unlikely( (frame_table[gpte>>PAGE_SHIFT].type_and_flags & PGT_type_mask) == PGT_l2_page_table) ) + { + printk("Linear mapping detected\n"); + spte = gpte & ~_PAGE_RW; + } + else + { + spte = ( gpte & ~PAGE_MASK ) | (s_sh< %08lx)",gpfn,spfn); + + + return spfn; +} + + +int shadow_fault( unsigned long va, long error_code ) +{ + unsigned long gpte, spte; + + MEM_VVLOG("shadow_fault( va=%08lx, code=%ld )", va, error_code ); + + if ( unlikely(__get_user(gpte, (unsigned long*)&linear_pg_table[va>>PAGE_SHIFT])) ) + { + MEM_VVLOG("shadow_fault - EXIT: read gpte faulted" ); + return 0; // propagate to guest + } + + if ( ! (gpte & _PAGE_PRESENT) ) + { + MEM_VVLOG("shadow_fault - EXIT: gpte not present (%lx)",gpte ); + return 0; // we're not going to be able to help + } + + spte = gpte; + + if ( error_code & 2 ) + { // write fault + if ( gpte & _PAGE_RW ) + { + gpte |= _PAGE_DIRTY | _PAGE_ACCESSED; + spte |= _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED; + // (we're about to dirty it anyhow...) + } + else + { // write fault on RO page + MEM_VVLOG("shadow_fault - EXIT: write fault on RO page (%lx)",gpte ); + return 0; // propagate to guest + // not clear whether we should set accessed bit here... + } + } + else + { + gpte |= _PAGE_ACCESSED; + spte |= _PAGE_ACCESSED; // about to happen anyway + if ( ! (gpte & _PAGE_DIRTY) ) + spte &= ~_PAGE_RW; // force clear unless already dirty + } + + MEM_VVLOG("plan: gpte=%08lx spte=%08lx", gpte, spte ); + + // write back updated gpte + // XXX watch out for read-only L2 entries! (not used in Linux) + if ( unlikely( __put_user( gpte, (unsigned long*)&linear_pg_table[va>>PAGE_SHIFT])) ) + BUG(); // fixme! + + if ( unlikely( __put_user( spte, (unsigned long*)&shadow_linear_pg_table[va>>PAGE_SHIFT])) ) + { + // failed: + // the L1 may not be shadowed, or the L2 entry may be insufficient + + unsigned long gpde, spde, gl1pfn, sl1pfn; + + MEM_VVLOG("3: not shadowed or l2 insufficient gpte=%08lx spte=%08lx",gpte,spte ); + + gpde = l2_pgentry_val(linear_l2_table[va>>L2_PAGETABLE_SHIFT]); + + gl1pfn = gpde>>PAGE_SHIFT; + + if ( ! (frame_table[gl1pfn].shadow_and_flags & PSH_shadowed ) ) + { + // this L1 is NOT already shadowed so we need to shadow it + struct pfn_info *sl1pfn_info; + unsigned long *gpl1e, *spl1e; + int i; + sl1pfn_info = alloc_domain_page( NULL ); // XXX account properly! + sl1pfn = sl1pfn_info - frame_table; + + MEM_VVLOG("4a: l1 not shadowed ( %08lx )",sl1pfn); + perfc_incrc(shadow_l1_table_count); + + sl1pfn_info->shadow_and_flags = PSH_shadow | gl1pfn; + frame_table[gl1pfn].shadow_and_flags = PSH_shadowed | sl1pfn; + + gpde = gpde | _PAGE_ACCESSED | _PAGE_DIRTY; + spde = (gpde & ~PAGE_MASK) | _PAGE_RW | (sl1pfn<>L2_PAGETABLE_SHIFT] = mk_l2_pgentry(gpde); + shadow_linear_l2_table[va>>L2_PAGETABLE_SHIFT] = mk_l2_pgentry(spde); + + gpl1e = (unsigned long *) &(linear_pg_table[ + (va>>PAGE_SHIFT) & ~(ENTRIES_PER_L1_PAGETABLE-1) ]); + + spl1e = (unsigned long *) &shadow_linear_pg_table[ + (va>>PAGE_SHIFT) & ~(ENTRIES_PER_L1_PAGETABLE-1) ]; + + + // XXX can only do this is the shadow/guest is writeable + // disable write protection if ! gpde & _PAGE_RW ???? + + for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) + { +#if SHADOW_OPTIMISE + if ( (gpl1e[i] & (_PAGE_PRESENT|_PAGE_ACCESSED) ) == + (_PAGE_PRESENT|_PAGE_ACCESSED) ) + { + spl1e[i] = gpl1e[i]; + if ( !(gpl1e[i] & _PAGE_DIRTY) ) + spl1e[i] &= ~_PAGE_RW; + } + else +#endif + spl1e[i] = 0; + } + + + } + else + { + // this L1 was shadowed (by another PT) but we didn't have an L2 + // entry for it + + sl1pfn = frame_table[gl1pfn].shadow_and_flags & PSH_pfn_mask; + + MEM_VVLOG("4b: was shadowed, l2 missing ( %08lx )",sl1pfn); + + spde = (gpde & ~PAGE_MASK) | (sl1pfn<>L2_PAGETABLE_SHIFT] = mk_l2_pgentry(gpde); + shadow_linear_l2_table[va>>L2_PAGETABLE_SHIFT] = mk_l2_pgentry(spde); + + + } + + shadow_linear_pg_table[va>>PAGE_SHIFT] = mk_l1_pgentry(spte); + // (we need to do the above even if we've just made the shadow L1) + + } // end of fixup writing the shadow L1 directly failed + + perfc_incrc(shadow_fixup_count); + + return 1; // let's try the faulting instruction again... + +} + + +void shadow_l1_normal_pt_update( unsigned long pa, unsigned long gpte, + unsigned long *prev_spfn_ptr, + l1_pgentry_t **prev_spl1e_ptr ) +{ + unsigned long gpfn, spfn, spte, prev_spfn = *prev_spfn_ptr; + l1_pgentry_t * spl1e, * prev_spl1e = *prev_spl1e_ptr; + + +MEM_VVLOG("shadow_l1_normal_pt_update pa=%08lx, gpte=%08lx, prev_spfn=%08lx, prev_spl1e=%08lx\n", +pa,gpte,prev_spfn, prev_spl1e); + + // to get here, we know the l1 page *must* be shadowed + + gpfn = pa >> PAGE_SHIFT; + spfn = frame_table[gpfn].shadow_and_flags & PSH_pfn_mask; + + if ( spfn == prev_spfn ) + { + spl1e = prev_spl1e; + } + else + { + if( prev_spl1e ) unmap_domain_mem( prev_spl1e ); + spl1e = (l1_pgentry_t *) map_domain_mem( spfn << PAGE_SHIFT ); + *prev_spfn_ptr = spfn; + *prev_spl1e_ptr = spl1e; + } + // XXX we assume only pagetables can be shadowed; this will have to change + // to allow arbitrary CoW etc. + + spte = 0; + +#if SHADOW_OPTIMISE + if ( (gpte & (_PAGE_PRESENT|_PAGE_ACCESSED) ) == + (_PAGE_PRESENT|_PAGE_ACCESSED) ) + { + spte = gpte; + if ( !(gpte & _PAGE_DIRTY ) ) + gpte &= ~ _PAGE_RW; + } +#endif + + spl1e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t) ] = + mk_l1_pgentry( spte ); + + unmap_domain_mem( (void *) spl1e ); +} + +void shadow_l2_normal_pt_update( unsigned long pa, unsigned long gpte ) +{ + unsigned long gpfn, spfn, spte; + l2_pgentry_t * sp2le; + unsigned long s_sh; + + MEM_VVLOG("shadow_l2_normal_pt_update pa=%08lx, gpte=%08lx",pa,gpte); + + // to get here, we know the l2 page has a shadow + + gpfn = pa >> PAGE_SHIFT; + spfn = frame_table[gpfn].shadow_and_flags & PSH_pfn_mask; + + sp2le = (l2_pgentry_t *) map_domain_mem( spfn << PAGE_SHIFT ); + // no real need for a cache here + + spte = 0; + + s_sh = frame_table[gpte >> PAGE_SHIFT].shadow_and_flags; + + if ( s_sh ) // PSH_shadowed + { + if ( unlikely( (frame_table[gpte>>PAGE_SHIFT].type_and_flags & PGT_type_mask) == PGT_l2_page_table) ) + { + // linear page table case + spte = (gpte & ~_PAGE_RW) | _PAGE_DIRTY | _PAGE_ACCESSED; + } + else + spte = (gpte & ~PAGE_MASK) | (s_sh<>PAGE_SHIFT; + gpfn = gpte>>PAGE_SHIFT; + + if ( gpfn == spfn ) + { + if ( level > 1 ) + FAIL("Linear map ???"); // XXX this will fail on BSD + +#if 0 // might be a RO mapping of a page table page + if ( frame_table[gpfn].shadow_and_flags != 0 ) + { + FAIL("Should have been shadowed g.sf=%08lx s.sf=%08lx", + frame_table[gpfn].shadow_and_flags, + frame_table[spfn].shadow_and_flags); + } + else +#endif + return 1; + } + else + { + if ( level < 2 ) + FAIL("Shadow in L1 entry?"); + + if ( frame_table[gpfn].shadow_and_flags != (PSH_shadowed | spfn) ) + FAIL("spfn problem g.sf=%08lx s.sf=%08lx [g.sf]=%08lx [s.sf]=%08lx", + frame_table[gpfn].shadow_and_flags, + frame_table[spfn].shadow_and_flags, + frame_table[frame_table[gpfn].shadow_and_flags&PSH_pfn_mask].shadow_and_flags, + frame_table[frame_table[spfn].shadow_and_flags&PSH_pfn_mask].shadow_and_flags + ); + + if ( frame_table[spfn].shadow_and_flags != (PSH_shadow | gpfn) ) + FAIL("gpfn problem g.sf=%08lx s.sf=%08lx", + frame_table[gpfn].shadow_and_flags, + frame_table[spfn].shadow_and_flags); + + } + + return 1; +} + + +int check_l1_table( unsigned long va, unsigned long g2, unsigned long s2 ) +{ + int j; + unsigned long *gpl1e, *spl1e; + + gpl1e = (unsigned long *) &(linear_pg_table[ va>>PAGE_SHIFT]); + spl1e = (unsigned long *) &(shadow_linear_pg_table[ va>>PAGE_SHIFT]); + + + for ( j = 0; j < ENTRIES_PER_L1_PAGETABLE; j++ ) + { + unsigned long gpte = gpl1e[j]; + unsigned long spte = spl1e[j]; + + check_pte( gpte, spte, 1, j ); + } + + return 1; +} + +#define FAILPT(_f, _a...) \ +{printk("XXX FAILPT" _f "\n", ## _a ); BUG();} + +int check_pagetable( pagetable_t pt, char *s ) +{ + unsigned long gptbase = pagetable_val(pt); + unsigned long gpfn, spfn; + int i; + l2_pgentry_t *gpl2e, *spl2e; + +return 1; + + sh_check_name = s; + + MEM_VVLOG("%s-PT Audit",s); + + sh_l2_present = sh_l1_present = 0; + + gpfn = gptbase >> PAGE_SHIFT; + + if ( ! (frame_table[gpfn].shadow_and_flags & PSH_shadowed) ) + { + printk("%s-PT %08lx not shadowed\n", s, gptbase); + + if( frame_table[gpfn].shadow_and_flags != 0 ) BUG(); + + return 0; + } + + spfn = frame_table[gpfn].shadow_and_flags & PSH_pfn_mask; + + if ( ! frame_table[gpfn].shadow_and_flags == (PSH_shadowed | spfn) ) + FAILPT("ptbase shadow inconsistent1"); + + if ( ! frame_table[spfn].shadow_and_flags == (PSH_shadow | gpfn) ) + FAILPT("ptbase shadow inconsistent2"); + + + // use the linear map to get a pointer to the L2 + gpl2e = (l2_pgentry_t *) &(linear_l2_table[0]); + spl2e = (l2_pgentry_t *) &(shadow_linear_l2_table[0]); + + // check the whole L2 + for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) + { + unsigned long gpte = l2_pgentry_val(gpl2e[i]); + unsigned long spte = l2_pgentry_val(spl2e[i]); + + check_pte( gpte, spte, 2, i ); + } + + + // go back and recurse + for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) + { + unsigned long gpte = l2_pgentry_val(gpl2e[i]); + unsigned long spte = l2_pgentry_val(spl2e[i]); + + if ( spte ) + check_l1_table( + i<>PAGE_SHIFT, spte>>PAGE_SHIFT ); + + } + + + MEM_VVLOG("PT verified : l2_present = %d, l1_present = %d\n", + sh_l2_present, sh_l1_present ); + + return 1; +} + + +#endif + + +#endif // CONFIG_SHADOW + + + diff --git a/xen/include/asm-i386/config.h b/xen/include/asm-i386/config.h index 3dd2986492..0496f481d9 100644 --- a/xen/include/asm-i386/config.h +++ b/xen/include/asm-i386/config.h @@ -40,6 +40,9 @@ #define CONFIG_XEN_ATTENTION_KEY 1 +#define CONFIG_SHADOW 1 + + #define HZ 100 /* @@ -68,7 +71,7 @@ */ #define MAX_MONITOR_ADDRESS (16*1024*1024) #define MAX_DMA_ADDRESS (16*1024*1024) -#define MAX_DIRECTMAP_ADDRESS (44*1024*1024) +#define MAX_DIRECTMAP_ADDRESS (40*1024*1024) // XXX was 44 /* And the virtual addresses for the direct-map region... */ #define DIRECTMAP_VIRT_START (READONLY_MPT_VIRT_END) #define DIRECTMAP_VIRT_END (DIRECTMAP_VIRT_START + MAX_DIRECTMAP_ADDRESS) @@ -81,8 +84,11 @@ /* Next 4MB of virtual address space is used as a linear p.t. mapping. */ #define LINEAR_PT_VIRT_START (DIRECTMAP_VIRT_END) #define LINEAR_PT_VIRT_END (LINEAR_PT_VIRT_START + (4*1024*1024)) +/* Next 4MB of virtual address space is used as a shadow linear p.t. map. */ +#define SH_LINEAR_PT_VIRT_START (LINEAR_PT_VIRT_END) +#define SH_LINEAR_PT_VIRT_END (SH_LINEAR_PT_VIRT_START + (4*1024*1024)) /* Next 4MB of virtual address space used for per-domain mappings (eg. GDT). */ -#define PERDOMAIN_VIRT_START (LINEAR_PT_VIRT_END) +#define PERDOMAIN_VIRT_START (SH_LINEAR_PT_VIRT_END) #define PERDOMAIN_VIRT_END (PERDOMAIN_VIRT_START + (4*1024*1024)) #define GDT_VIRT_START (PERDOMAIN_VIRT_START) #define GDT_VIRT_END (GDT_VIRT_START + (64*1024)) diff --git a/xen/include/asm-i386/page.h b/xen/include/asm-i386/page.h index a4339d64dd..64b5cf73a0 100644 --- a/xen/include/asm-i386/page.h +++ b/xen/include/asm-i386/page.h @@ -91,6 +91,7 @@ typedef struct { unsigned long pt_lo; } pagetable_t; #include #define linear_pg_table ((l1_pgentry_t *)LINEAR_PT_VIRT_START) +#define linear_l2_table ((l2_pgentry_t *)(LINEAR_PT_VIRT_START+(LINEAR_PT_VIRT_START>>(L2_PAGETABLE_SHIFT-L1_PAGETABLE_SHIFT)))) extern l2_pgentry_t idle_pg_table[ENTRIES_PER_L2_PAGETABLE]; extern void paging_init(void); diff --git a/xen/include/asm-i386/processor.h b/xen/include/asm-i386/processor.h index c7df85aa28..9766ac7b20 100644 --- a/xen/include/asm-i386/processor.h +++ b/xen/include/asm-i386/processor.h @@ -415,6 +415,12 @@ struct mm_struct { */ l1_pgentry_t *perdomain_pt; pagetable_t pagetable; + +#ifdef CONFIG_SHADOW + unsigned int shadowmode; /* flags to control shadow table operation */ + pagetable_t shadowtable; +#endif + /* Current LDT details. */ unsigned long ldt_base, ldt_ents, shadow_ldt_mapcnt; /* Next entry is passed to LGDT on domain switch. */ diff --git a/xen/include/xeno/mm.h b/xen/include/xeno/mm.h index 0774571a73..c1df341a28 100644 --- a/xen/include/xeno/mm.h +++ b/xen/include/xeno/mm.h @@ -67,6 +67,10 @@ struct pfn_info unsigned long type_and_flags; /* Timestamp from 'TLB clock', used to reduce need for safety flushes. */ unsigned long tlbflush_timestamp; +#ifdef CONFIG_SHADOW + /* Shadow page status: top bits flags, bottom bits are a pfn */ + unsigned long shadow_and_flags; +#endif }; /* The following page types are MUTUALLY EXCLUSIVE. */ @@ -100,6 +104,7 @@ struct pfn_info /* 28-bit count of references to this frame. */ #define PGC_count_mask ((1<<28)-1) + /* We trust the slab allocator in slab.c, and our use of it. */ #define PageSlab(page) (1) #define PageSetSlab(page) ((void)0) diff --git a/xen/include/xeno/perfc_defn.h b/xen/include/xeno/perfc_defn.h index f81b5bcba1..0475b6371e 100644 --- a/xen/include/xeno/perfc_defn.h +++ b/xen/include/xeno/perfc_defn.h @@ -19,6 +19,15 @@ PERFCOUNTER_CPU( need_flush_tlb_flush, "PG_need_flush tlb flushes" ) PERFCOUNTER_CPU( calls_to_mmu_update, "calls_to_mmu_update" ) PERFCOUNTER_CPU( num_page_updates, "num_page_updates" ) - +PERFCOUNTER_CPU( calls_to_update_va, "calls_to_update_va_map" ) +PERFCOUNTER_CPU( page_faults, "page faults" ) +PERFCOUNTER_CPU( copy_user_faults, "copy_user faults" ) +PERFCOUNTER_CPU( map_domain_mem_count, "map_domain_mem count" ) + +PERFCOUNTER_CPU( shadow_l2_table_count, "shadow_l2_table count" ) +PERFCOUNTER_CPU( shadow_l1_table_count, "shadow_l1_table count" ) +PERFCOUNTER_CPU( unshadow_table_count, "unshadow_table count" ) +PERFCOUNTER_CPU( shadow_fixup_count, "shadow_fixup count" ) +PERFCOUNTER_CPU( shadow_update_va_fail, "shadow_update_va_fail" ) diff --git a/xen/include/xeno/shadow.h b/xen/include/xeno/shadow.h new file mode 100644 index 0000000000..dca0126784 --- /dev/null +++ b/xen/include/xeno/shadow.h @@ -0,0 +1,48 @@ +/* -*- Mode:C; c-basic-offset:4; tab-width:4 -*- */ + +#ifndef _XENO_SHADOW_H +#define _XENO_SHADOW_H + +#ifdef CONFIG_SHADOW + +#include +#include +#include + +/* Shadow PT flag bits in pfn_info */ +#define PSH_shadowed (1<<31) /* page has a shadow. PFN points to shadow */ +#define PSH_shadow (1<<30) /* page is a shadow. PFN points to orig page */ +#define PSH_pending (1<<29) /* page is in the process of being shadowed */ +#define PSH_pfn_mask ((1<<21)-1) + +/* Shadow PT operation mode : shadowmode variable in mm_struct */ +#define SHM_test (1<<0) /* just run domain on shadow PTs */ +#define SHM_logdirty (1<<1) /* log pages that are dirtied */ +#define SHM_cow (1<<2) /* copy on write all dirtied pages */ +#define SHM_translate (1<<3) /* lookup machine pages in translation table */ + +#define shadow_linear_pg_table ((l1_pgentry_t *)SH_LINEAR_PT_VIRT_START) +#define shadow_linear_l2_table ((l2_pgentry_t *)(SH_LINEAR_PT_VIRT_START+(SH_LINEAR_PT_VIRT_START>>(L2_PAGETABLE_SHIFT-L1_PAGETABLE_SHIFT)))) + +extern pagetable_t shadow_mk_pagetable( unsigned long gptbase, unsigned int shadowmode ); +extern void unshadow_table( unsigned long gpfn ); +extern unsigned long shadow_l2_table( unsigned long gpfn ); +extern int shadow_fault( unsigned long va, long error_code ); +extern void shadow_l1_normal_pt_update( unsigned long pa, unsigned long gpte, + unsigned long *prev_spfn_ptr, + l1_pgentry_t **prev_spl1e_ptr ); +extern void shadow_l2_normal_pt_update( unsigned long pa, unsigned long gpte ); + + +#define SHADOW_DEBUG 0 +#define SHADOW_OPTIMISE 1 + +#if SHADOW_DEBUG +extern int check_pagetable( pagetable_t pt, char *s ); +#else +#define check_pagetable( pt, s ) +#endif + + +#endif +#endif diff --git a/xen/net/dev.c b/xen/net/dev.c index fbd9be63c9..bb25e6a2b9 100644 --- a/xen/net/dev.c +++ b/xen/net/dev.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -488,7 +489,7 @@ struct netif_rx_stats netdev_rx_stat[NR_CPUS]; void deliver_packet(struct sk_buff *skb, net_vif_t *vif) { rx_shadow_entry_t *rx; - unsigned long *ptep, pte; + unsigned long *ptep, pte, new_pte; struct pfn_info *old_page, *new_page, *pte_page; unsigned short size; unsigned char offset, status = RING_STATUS_OK; @@ -530,10 +531,12 @@ void deliver_packet(struct sk_buff *skb, net_vif_t *vif) wmb(); /* Get type count and set flush bit before updating PTE. */ pte = *ptep; + + new_pte = (pte & ~PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT | + ((new_page - frame_table) << PAGE_SHIFT); + if ( unlikely(pte & _PAGE_PRESENT) || - unlikely(cmpxchg(ptep, pte, - (pte & ~PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT | - ((new_page - frame_table) << PAGE_SHIFT))) != pte ) + unlikely(cmpxchg(ptep, pte, new_pte)) != pte ) { DPRINTK("PTE was modified or reused! %08lx %08lx\n", pte, *ptep); unmap_domain_mem(ptep); @@ -543,6 +546,22 @@ void deliver_packet(struct sk_buff *skb, net_vif_t *vif) goto out; } + +#ifdef CONFIG_SHADOW + if ( pte_page->shadow_and_flags & PSH_shadowed ) + { + unsigned long spte_pfn = pte_page->shadow_and_flags & PSH_pfn_mask; + unsigned long *sptr = map_domain_mem( (spte_pfn<> PAGE_SHIFT; pte_page = &frame_table[pte_pfn]; + + //printk("MMM %08lx ", rx.addr); /* The address passed down must be to a valid PTE. */ if ( unlikely(pte_pfn >= max_page) || @@ -2081,7 +2102,7 @@ static void get_rx_bufs(net_vif_t *vif) ptep = map_domain_mem(rx.addr); pte = *ptep; - + //printk("%08lx\n",pte); /* We must be passed a valid writeable mapping to swizzle. */ if ( unlikely((pte & (_PAGE_PRESENT|_PAGE_RW)) != (_PAGE_PRESENT|_PAGE_RW)) || @@ -2092,6 +2113,22 @@ static void get_rx_bufs(net_vif_t *vif) make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0); goto rx_unmap_and_continue; } + +#ifdef CONFIG_SHADOW + { + if ( frame_table[rx.addr>>PAGE_SHIFT].shadow_and_flags & PSH_shadowed ) + { + unsigned long spfn = + frame_table[rx.addr>>PAGE_SHIFT].shadow_and_flags & PSH_pfn_mask; + unsigned long * sptr = map_domain_mem( (spfn<> PAGE_SHIFT; buf_page = &frame_table[buf_pfn]; @@ -2112,6 +2149,8 @@ static void get_rx_bufs(net_vif_t *vif) put_page_and_type(pte_page); make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0); goto rx_unmap_and_continue; + + // XXX IAP should SHADOW_CONFIG do something here? } /* -- 2.30.2